5.1 Continuous Variable Plots
5.1.5 Price vs. Year Built
# Price against construction year: one point per row of the training frame.
# The caption text is produced by generate_figure_caption().
ggplot(train_df_non_linear, aes(yr_built, price)) +
  geom_point(shape = 20, colour = "orange") +
  labs(
    title = "Price vs. Year Built",
    x = "Year Built",
    y = "Price",
    caption = generate_figure_caption("Price vs. Year Built", section = 5)
  )

The scatter plot above compares price against the year
when homes were initially built (yr_built). This analysis
helps us understand how the age of a home relates to its sale price.
# Histogram of construction years (50 bins).
# geom_histogram() plots the number of homes per bin, so the y-axis is
# labelled "Frequency" (as the bar plots in this report are), not "Density".
ggplot(data = train_df_non_linear, aes(x = yr_built)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Distribution of Year Built",
    x = "Year Built",
    y = "Frequency",
    caption = generate_figure_caption("Distribution of Year Built", section = 5)
  )

The histogram above displays the distribution of
yr_built. It provides insights into the distribution of
home ages in the dataset.
5.1.6 Price vs. Year of Last Renovation
Excluding homes that did not have a documented renovation.
# Keep only rows with yr_renovated > 0, i.e. homes with a documented renovation
filtered_data <- train_df_non_linear[train_df_non_linear$yr_renovated > 0, ]
# Left edge of the x-axis: one year before the earliest documented renovation.
# Computed once from the filtered rows (it was previously recomputed twice).
lowest_non_zero_renovation_year <- min(filtered_data$yr_renovated) - 1
# Scatter plot of Price vs. Year Renovated
ggplot(data = filtered_data, aes(x = yr_renovated, y = price)) +
  geom_point(pch = 20, col = "brown") +
  labs(
    title = "Price vs. Year Renovated",
    x = "Year Renovated",
    y = "Price",
    caption = generate_figure_caption("Price vs. Year Renovated (Non-Zero Values)", section = 5)
  ) +
  xlim(c(lowest_non_zero_renovation_year, max(train_df_non_linear$yr_renovated)))

In the scatter plot above, we compare price against the
year of the last renovation (yr_renovated). This analysis
helps us understand whether recent renovations impact home prices.
# Keep only rows with yr_renovated > 0, i.e. homes with a documented renovation
filtered_data <- train_df_non_linear[train_df_non_linear$yr_renovated > 0, ]
# Left edge of the visible window: one year before the earliest renovation
lowest_non_zero_renovation_year <- min(filtered_data$yr_renovated) - 1
# Histogram of Year Renovated.
# - binwidth = 1 gives one bar per calendar year and avoids the default
#   "stat_bin() using bins = 30" message.
# - coord_cartesian() zooms without discarding data; xlim() sets scale limits,
#   which can silently drop bars that touch the edge of the range.
# - The bars are counts, so the y-axis is labelled "Frequency", not "Density".
ggplot(data = filtered_data, aes(x = yr_renovated)) +
  geom_histogram(binwidth = 1, fill = "orange") +
  labs(
    title = "Histogram of Year Renovated",
    x = "Year Renovated",
    y = "Frequency",
    caption = generate_figure_caption("Histogram of Year Renovated (Non-Zero Values)", section = 5)
  ) +
  coord_cartesian(
    xlim = c(lowest_non_zero_renovation_year, max(train_df_non_linear$yr_renovated))
  )

The histogram above visualizes the distribution of
yr_renovated. It provides insights into the distribution of
renovation years in the dataset.
5.1.7 Price vs. Distance to Convergence
# Price against distance_to_convergence: one point per row of the training
# frame. The caption text is produced by generate_figure_caption().
ggplot(train_df_non_linear, aes(distance_to_convergence, price)) +
  geom_point(shape = 20, colour = "violet") +
  labs(
    title = "Price vs. Distance to Convergence",
    x = "Distance to Convergence",
    y = "Price",
    caption = generate_figure_caption("Price vs. Distance to Convergence", section = 5)
  )

The scatter plot above compares price against
distance_to_convergence. This analysis helps us explore
whether the distance to a convergence point impacts home prices.
# Histogram of distance_to_convergence (50 bins).
# geom_histogram() plots the number of homes per bin, so the y-axis is
# labelled "Frequency" (as the bar plots in this report are), not "Density".
ggplot(data = train_df_non_linear, aes(x = distance_to_convergence)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Distribution of Distance to Convergence",
    x = "Distance to Convergence",
    y = "Frequency",
    caption = generate_figure_caption("Distribution of Distance to Convergence", section = 5)
  )

5.2 Categorical Variable Analysis
The distribution and count of categorical variables such as
bedrooms, bathrooms, floors,
waterfront, view, condition, and
grade are analyzed.
5.2.1 Price vs. Bedrooms
# Store bedrooms as a factor so each distinct bedroom count gets its own box
train_df_non_linear[["bedrooms_factor"]] <- factor(train_df_non_linear[["bedrooms"]])
# One box of prices per bedroom count
ggplot(train_df_non_linear, aes(bedrooms_factor, price)) +
  geom_boxplot(fill = "blue") +
  labs(
    title = "Price vs. Bedrooms",
    x = "Bedrooms",
    y = "Price",
    caption = generate_figure_caption("Price vs. Bedrooms", section = 5)
  )

The boxplot above compares price against the number
of bedrooms. This visualization helps us understand how the
number of bedrooms influences home prices.
# Bedroom counts with the 33-bedroom value excluded
filtered_bedrooms <- train_df_non_linear$bedrooms[train_df_non_linear$bedrooms != 33]
# Frequency of each remaining bedroom count; table() orders numeric input
# numerically, so its names are already in ascending bedroom order
bedroom_frequencies <- table(filtered_bedrooms)
bedroom_levels <- names(bedroom_frequencies)
# Bar plot of the counts. The factor levels are fixed to the table order:
# as.factor() on the character names would sort them as text and place
# "10" and "11" between "1" and "2" on the x-axis.
ggplot(
  data = data.frame(
    filtered_bedrooms = factor(bedroom_levels, levels = bedroom_levels),
    filtered_counts = as.numeric(bedroom_frequencies)
  ),
  aes(x = filtered_bedrooms, y = filtered_counts)
) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(
    title = "Distribution of Bedrooms (Excluding 33 Bedrooms)",
    x = "Number of Bedrooms",
    y = "Frequency",
    caption = generate_figure_caption("Distribution of Bedrooms (Excluding 33 Bedrooms)", section = 5)
  )

The bar plot above displays the distribution of the
bedrooms variable, showing the frequency of each bedroom
count.
5.2.2 Price vs. Bathrooms
# Store bathrooms as a factor so each distinct bathroom count gets its own box
train_df_non_linear[["bathrooms_factor"]] <- factor(train_df_non_linear[["bathrooms"]])
# One box of prices per bathroom count
ggplot(train_df_non_linear, aes(bathrooms_factor, price)) +
  geom_boxplot(fill = "green") +
  labs(
    title = "Price vs. Bathrooms",
    x = "Bathrooms",
    y = "Price",
    caption = generate_figure_caption("Price vs. Bathrooms", section = 5)
  )

In the boxplot above, we compare price against the
number of bathrooms. This analysis helps us explore the
relationship between the number of bathrooms and home prices.
# Tabulate how many homes have each bathroom count
bathrooms_counts <- table(train_df_non_linear$bathrooms)
# Split the table into numeric x positions and bar heights
bathrooms <- as.numeric(names(bathrooms_counts))
counts <- as.numeric(bathrooms_counts)
# One bar per distinct bathroom count, height = number of homes
ggplot(
  data.frame(bathrooms = bathrooms, counts = counts),
  aes(bathrooms, counts)
) +
  geom_col(fill = "green") +
  labs(
    title = "Distribution of Bathrooms",
    x = "Number of Bathrooms",
    y = "Frequency",
    caption = generate_figure_caption("Distribution of Bathrooms", section = 5)
  )

The bar plot above visualizes the distribution of the
bathrooms variable, showing the frequency of each bathroom
count.
5.2.3 Price vs. Floors
# One box of prices per distinct floors value; floors stays on a continuous
# x-axis and the group aesthetic splits the data into separate boxes
ggplot(train_df_non_linear, aes(floors, price, group = floors)) +
  geom_boxplot(fill = "orange") +
  labs(
    title = "Price vs. Floors",
    x = "Floors",
    y = "Price",
    caption = generate_figure_caption("Price vs. Floors", section = 5)
  )

The boxplot above compares price against the number
of floors. This analysis helps us understand how the number
of floors in a home relates to its sale price.
# Tabulate how many homes have each floors value
floors_counts <- table(train_df_non_linear$floors)
# Split the table into numeric x positions and bar heights
floors <- as.numeric(names(floors_counts))
counts <- as.numeric(floors_counts)
# One bar per distinct floors value, height = number of homes
ggplot(
  data.frame(floors = floors, counts = counts),
  aes(floors, counts)
) +
  geom_col(fill = "orange") +
  labs(
    title = "Distribution of Floors",
    x = "Number of Floors",
    y = "Frequency",
    caption = generate_figure_caption("Distribution of Floors", section = 5)
  )

The bar plot above displays the distribution of the
floors variable, showing the frequency of each floor
count.
5.2.4 Price vs. Waterfront
# Boxplot of price by waterfront status.
# The earlier labs(fill = ..., levels = ...) arguments had no effect: nothing
# is mapped to fill and `levels` is not an aesthetic, so the axis never showed
# "No"/"Yes". The indicator is now a discrete x variable relabelled on the scale.
# NOTE(review): assumes waterfront_1 is coded 0 = no, 1 = yes - confirm.
ggplot(data = train_df_non_linear, aes(x = factor(waterfront_1), y = price)) +
  geom_boxplot(fill = "purple") +
  scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
  labs(
    title = "Price vs. Waterfront",
    x = "Waterfront",
    y = "Price",
    caption = generate_figure_caption("Price vs. Waterfront", section = 5)
  )

In the boxplot above, we compare price against the
waterfront variable. This visualization helps us explore
how having a waterfront view impacts home prices.
# Tabulate how many homes fall under each waterfront_1 value
waterfront_counts <- table(train_df_non_linear$waterfront_1)
waterfront <- as.numeric(names(waterfront_counts))
counts <- as.numeric(waterfront_counts)
# Bar plot for the distribution of Waterfront.
# The earlier labs(fill = ..., levels = ...) arguments had no effect, so the
# bars were labelled with raw indicator values. The indicator is now a
# discrete x variable relabelled on the scale.
# NOTE(review): assumes waterfront_1 is coded 0 = no, 1 = yes - confirm.
ggplot(data = data.frame(waterfront, counts), aes(x = factor(waterfront), y = counts)) +
  geom_bar(stat = "identity", fill = "purple") +
  scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
  labs(
    title = "Distribution of Waterfront",
    x = "Waterfront",
    y = "Frequency",
    caption = generate_figure_caption("Distribution of Waterfront", section = 5)
  )

The bar plot above visualizes the distribution of the
waterfront variable, showing the frequency of waterfront
and non-waterfront properties.
5.2.5 Price vs. View
# Collapse the view_* dummy columns into a single factor for labelling in ggplot
view_columns <- c("view_0", "view_1", "view_2", "view_3", "view_4")
# Position of the first dummy equal to 1 in each row. `[1]` guarantees one value
# per row: NA when no dummy is set, the first hit when several are set. Without
# it apply() returns a list for such rows and factor() fails.
view_index <- apply(
  as.matrix(train_df_non_linear[, view_columns]),
  1,
  function(flags) which(flags == 1)[1]
)
# Explicit levels keep each label tied to its column even when a view category
# has no rows; without them factor() errors because labels outnumber levels.
train_df_non_linear$view_category <- factor(
  view_index,
  levels = seq_along(view_columns),
  labels = c("View 0", "View 1", "View 2", "View 3", "View 4")
)
# Create the boxplot with ggplot2
ggplot(train_df_non_linear, aes(x = view_category, y = price)) +
  geom_boxplot(fill = "brown") +
  labs(
    title = "Price vs. View Quality",
    x = "View Quality",
    y = "Price",
    caption = generate_figure_caption("Boxplot of Price vs. View Quality", section = 5)
  )

The boxplot above compares price against the
view variable, which represents the quality of the
property’s view. This analysis helps us explore how the view quality
impacts home prices.
# Column sums of the view_* dummies give the number of homes per view rating
view_frequencies <- colSums(
  train_df_non_linear[, c("view_0", "view_1", "view_2", "view_3", "view_4")]
)
# Two-column frame for plotting: dummy column name and its total
view_df <- data.frame(
  View = names(view_frequencies),
  Frequency = view_frequencies
)
# One bar per view dummy column
ggplot(view_df, aes(View, Frequency)) +
  geom_col(fill = "purple") +
  labs(
    title = "Distribution of View Quality",
    x = "View Quality",
    y = "Frequency",
    caption = generate_figure_caption("Distribution of View Quality", section = 5)
  )

The bar plot above displays the distribution of the view
variable, showing the frequency of different view quality ratings.
5.2.6 Price vs. Condition
# Collapse the condition_* dummy columns into a single factor
condition_columns <- c("condition_1", "condition_2", "condition_3", "condition_4", "condition_5")
# Position of the first dummy equal to 1 in each row. `[1]` guarantees one value
# per row: NA when no dummy is set, the first hit when several are set. Without
# it apply() returns a list for such rows and factor() fails.
condition_index <- apply(
  as.matrix(train_df_non_linear[, condition_columns]),
  1,
  function(flags) which(flags == 1)[1]
)
# Explicit levels keep each label tied to its column even when a condition
# rating has no rows; without them factor() errors because labels outnumber
# levels.
train_df_non_linear$condition_category <- factor(
  condition_index,
  levels = seq_along(condition_columns),
  labels = c("Condition 1", "Condition 2", "Condition 3", "Condition 4", "Condition 5")
)
# Create the boxplot with ggplot2
ggplot(train_df_non_linear, aes(x = condition_category, y = price)) +
  geom_boxplot(fill = "blue") +
  labs(
    title = "Price vs. Condition",
    x = "Condition",
    y = "Price",
    caption = generate_figure_caption("Boxplot of Price vs. Condition", section = 5)
  )

In the boxplot above, we compare price against the
condition variable, which represents the condition of the
property. This analysis helps us explore how property condition relates
to home prices.
# Column sums of the condition_* dummies give the number of homes per rating
condition_frequencies <- colSums(
  train_df_non_linear[, c("condition_1", "condition_2", "condition_3", "condition_4", "condition_5")]
)
# Two-column frame for plotting: dummy column name and its total
condition_df <- data.frame(
  Condition = names(condition_frequencies),
  Frequency = condition_frequencies
)
# One bar per condition dummy column
ggplot(condition_df, aes(Condition, Frequency)) +
  geom_col(fill = "green") +
  labs(
    title = "Distribution of Condition",
    x = "Condition Rating",
    y = "Frequency",
    caption = generate_figure_caption("Distribution of Condition", section = 5)
  )

The bar plot above visualizes the distribution of the
condition variable, showing the frequency of different
condition ratings.
5.2.7 Price vs. Grade
# Identify the grade dummy columns. The pattern is anchored to the start of the
# name, and the columns derived in this section are excluded, so re-running the
# chunk does not treat grade_category / grade_category_numeric as dummies.
grade_columns <- grep("^grade_", names(train_df_non_linear), value = TRUE)
grade_columns <- setdiff(grade_columns, c("grade_category", "grade_category_numeric"))
# Convert the dummies back to one index per row: the position (within
# grade_columns) of the first dummy equal to 1, or NA when none is set
# (including rows where every dummy is NA).
train_df_non_linear$grade_category <- apply(
  as.matrix(train_df_non_linear[, grade_columns, drop = FALSE]),
  1,
  function(row) which(row == 1)[1]
)
# Extract grade labels from column names, replacing underscores with hyphens for better readability
grade_labels <- sub("grade_", "", grade_columns) # Remove 'grade_' prefix
grade_labels <- gsub("_", "-", grade_labels) # Replace underscores with hyphens
# Boxplot of Price vs. Grade. Explicit levels keep each label tied to its column
# even when a grade has no rows in the data.
ggplot(
  train_df_non_linear,
  aes(
    x = factor(grade_category, levels = seq_along(grade_columns), labels = grade_labels),
    y = price
  )
) +
  geom_boxplot(fill = "green") +
  labs(
    title = "Price vs. Grade",
    x = "Grade",
    y = "Price",
    caption = generate_figure_caption("Boxplot of Price vs. Grade", section = 5)
  )

The boxplot above compares price against the
grade variable, which has been aggregated into categories
as per the provided header. This analysis helps us explore how the grade
of construction and design impacts home prices.
# Histogram for the Distribution of Grade
# Convert the grade category to a numeric variable for histogram plotting
train_df_non_linear$grade_category_numeric <- as.numeric(train_df_non_linear$grade_category)
# Define breaks for the histogram: one unit-wide bin centred on every integer
# index between the smallest and largest observed grade.
# The earlier unique(x, na.rm = TRUE) did not drop NAs (unique() has no na.rm
# argument), and counting distinct values misaligns the bins whenever an
# intermediate grade is absent.
grade_range <- range(train_df_non_linear$grade_category_numeric, na.rm = TRUE)
num_breaks <- grade_range[2] - grade_range[1] + 1
hist_breaks <- seq(
  grade_range[1] - 0.5,
  grade_range[2] + 0.5,
  length.out = num_breaks + 1
)
# Create a histogram with ggplot2; the x-axis ticks reuse the grade labels
ggplot(train_df_non_linear, aes(x = grade_category_numeric)) +
  geom_histogram(fill = "purple", breaks = hist_breaks) +
  scale_x_continuous(breaks = seq_along(grade_labels), labels = grade_labels) +
  labs(
    title = "Distribution of Grade",
    x = "Grade",
    y = "Frequency",
    caption = generate_figure_caption("Histogram of Distribution of Grade", section = 5)
  )

The histogram above displays the distribution of the
grade_category variable, showing the frequency of different
grade categories.
5.3 Correlation Analysis
Understanding how continuous variables correlate with each other and,
more importantly, with the target variable price.
5.3.1 Correlation Matrix
# Correlation matrix of the numeric columns.
# vapply() always returns a logical vector, whereas sapply() changes its return
# type on degenerate input.
numeric_columns <- vapply(train_df_non_linear, is.numeric, logical(1))
cor_matrix <- cor(train_df_non_linear[numeric_columns])
# Correlations with price, largest first. The sort is on the signed value, not
# the absolute value, so strongly negative correlations end up at the bottom.
cor_table <- as.data.frame(sort(cor_matrix[, "price"], decreasing = TRUE))
# Top 20 rows; head() returns fewer rows instead of NA padding when the table
# has fewer than 20 entries.
top_20_corr <- head(cor_table, 20)
Table 5.1 Top 20 Correlation Values with Price

| Variable | Correlation with Price |
|---|---|
| price | 1.0000000 |
| sqft_living | 0.7020794 |
| sqft_above | 0.5998325 |
| sqft_living15 | 0.5931358 |
| bathrooms | 0.5202595 |
| grade_Above_Average | 0.4714246 |
| sqft_basement | 0.3307377 |
| lat | 0.3122509 |
| bedrooms | 0.3051678 |
| view_4 | 0.2997715 |
| zipcode_98004 | 0.2748744 |
| floors | 0.2569215 |
| waterfront_1 | 0.2455379 |
| zipcode_98040 | 0.2082191 |
| view_3 | 0.1908930 |
| zipcode_98112 | 0.1810434 |
| zipcode_98039 | 0.1711855 |
| view_2 | 0.1472079 |
| zipcode_98006 | 0.1363141 |
| yr_renovated | 0.1276562 |

5.3.2 Correlation Graphics Analysis
In the table presented above, we’ve showcased the top 20 correlation
values concerning the target variable, price, with the
values sorted in descending order of their signed correlation. Here are some crucial
observations from this analysis:
- Positive Correlations with Price:
- Variables like
sqft_living, sqft_above,
sqft_living15, and bathrooms exhibit robust
positive correlations with the target variable (price).
This implies that as these features increase, house prices tend to
increase correspondingly.
- Features such as
grade_Above_Average and view_4 also demonstrate positive
correlations, indicating that properties with higher grades and better
views tend to command higher prices.
- Negative Correlations with Price:
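- There are no negative correlations among the top 20 correlated
variables. Because the table is sorted by signed correlation and
truncated to the 20 largest values, negatively correlated features
cannot appear in it; their absence here does not show that no feature is
associated with lower house prices.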
- Feature Importance:
- The strength of these correlations provides insights into the
importance of variables in predicting house prices. Variables like
sqft_living and grades emerge as strong predictors of
price.
- Location-related variables, such as
zipcode_98004,
zipcode_98039, and zipcode_98040, also exhibit
noteworthy positive correlations, underscoring the significance of
location in price determination.
5.3.3 Correlation Heatmap
# Heatmap restricted to the variables listed in top_20_corr.
# Take the variable names from the table's row names and cut the matching
# square block out of the full correlation matrix.
top_20_corr_variables <- rownames(top_20_corr)
top_20_corr_matrix <- cor_matrix[top_20_corr_variables, top_20_corr_variables]
# melt() reshapes the matrix into one row per cell (Var1, Var2, value);
# each cell becomes a tile filled by its correlation value.
ggplot(
  melt(top_20_corr_matrix),
  aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  labs(
    title = "Top 20 Correlations",
    x = "Variable",
    y = "Variable",
    caption = generate_figure_caption("Heatmap showing the top 20 correlations", section = 5)
  ) +
  theme_minimal() +
  # Applied after theme_minimal() so the rotated x labels are not overwritten
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

5.3.4 Correlation Matrix for Multicollinearity
# Predictor set: every column of the linear training frame except the response
predictors <- dplyr::select(train_df_linear, -price)
# Recode non-numeric columns so cor() can use them: factors become their
# integer codes; character columns are converted to factors first
numeric_predictors <- predictors %>%
  mutate(across(where(is.factor), as.numeric)) %>%
  mutate(across(where(is.character), ~as.numeric(as.factor(.))))
# Pairwise correlations, using all complete observations for each pair
corr_matrix <- cor(numeric_predictors, use = "pairwise.complete.obs")
# Long format (one row per matrix cell), then keep each strongly correlated
# pair once:
#   - no self-correlations
#   - |correlation| above 0.8
#   - Var1 precedes Var2 in the matrix order, so each pair appears only once
#     (this keeps the upper triangle of the matrix)
# Finally give the columns descriptive names.
correlated_pairs_df <- melt(corr_matrix) %>%
  filter(
    Var1 != Var2,
    abs(value) > 0.8,
    match(Var1, rownames(corr_matrix)) < match(Var2, rownames(corr_matrix))
  ) %>%
  rename(Variable1 = Var1, Variable2 = Var2, Correlation = value)
# Render the pairs as a markdown table with knitr::kable() and print it
knitr_table <- kable(
  correlated_pairs_df,
  caption = generate_table_caption("Highly Correlated Variable Pairs", section = 5),
  format = "markdown"
)
print(knitr_table)
Table 5.2 Highly Correlated Variable Pairs

| Variable1 | Variable2 | Correlation |
|---|---|---|
| sqft_living | sqft_above | 0.8744114 |
| month_sold | week_of_year | 0.9955447 |
| month_sold | day_of_year | 0.9958281 |
| week_of_year | day_of_year | 0.9996951 |
| condition_3 | condition_4 | -0.8095157 |
| grade_Above_Average | grade_Average | -0.9954905 |

5.3.5 Detailed Explanation for Removal
5.3.5.1 sqft_above & sqft_living
The removal of sqft_above is justified by its high correlation
coefficient of 0.8744114 with sqft_living.
sqft_above represents the square footage of the living area
above ground, while sqft_living encompasses the total
square footage of living space. Since sqft_above is a
subset of sqft_living, it is likely to contain redundant
information, making it less valuable for our model.
5.3.5.2 month_sold & week_of_year
The variables month_sold and week_of_year
exhibit a remarkably high correlation coefficient of 0.9955447. These
variables are intrinsically correlated as they both pertain to the date
of the house sale. While day_of_year provides the most
detailed temporal information, retaining both week_of_year
and month_sold may lead to multicollinearity issues. It’s
advisable to consider removing one of these variables to mitigate
multicollinearity while preserving the most granular date-related
information.
5.3.5.3 month_sold & day_of_year
Similar to the previous case, month_sold and
day_of_year demonstrate a high correlation coefficient of
0.9958281. Both variables are related to the date of the house sale.
Given that day_of_year provides the most granular temporal
information, it may be preferred to retain it while considering the
removal of month_sold to address multicollinearity
concerns.
5.3.5.4 week_of_year &
day_of_year
The correlation coefficient of 0.9996951 between
week_of_year and day_of_year indicates an
extremely high correlation. Both variables are associated with the date
of sale. Given the granularity of day_of_year, retaining it
and potentially removing week_of_year can be a strategy to
reduce multicollinearity while retaining essential date-related
information.
5.3.5.5 condition_4 & condition_3
The variables condition_4 and condition_3
display a notable negative correlation coefficient of -0.8095157. These
variables are derived from the categorical variable indicating the
condition of the house. Through one-hot encoding, binary variables were
created for each condition. Since these conditions are mutually
exclusive, they exhibit a negative correlation. Consideration can be
given to keeping one condition as a reference group and discarding the
other, or reverting to using the original categorical variable to
effectively capture overall house condition.
5.3.5.6 grade_Above_Average &
grade_Average
The correlation coefficient of -0.9954905 between
grade_Above_Average and grade_Average
highlights a strong negative correlation. These variables represent
different grade categories of houses. Such a high correlation suggests
that retaining both variables may introduce multicollinearity into the
model. Decisions can be made to keep one of these variables as a
representative of house grade or explore alternative encoding
strategies.
By addressing the removal of these highly correlated variable pairs,
our primary goal is to mitigate multicollinearity issues.
Multicollinearity can distort regression coefficient estimates, inflate
standard errors, and potentially obscure the statistical significance of
predictors. The objective is to retain variables that provide unique and
informative contributions to the model’s prediction of house prices.
5.5 Temporal Trends Analysis
Analyzing the influence of time-related features such as
month, season, week of the year, and day of the year on house prices.
5.5.1 Monthly Trends in Average House Prices
# Mean sale price per month_sold value, as a two-column frame
monthly_trends <- setNames(
  aggregate(
    train_df_non_linear$price,
    by = list(train_df_non_linear$month_sold),
    FUN = mean
  ),
  c("Month", "Average_Price")
)
# Row holding the highest monthly average; used to annotate the peak
global_max <- monthly_trends[which.max(monthly_trends$Average_Price), ]
# Line chart of the monthly averages with the peak labelled just above it;
# the text layer inherits x and y from the plot-level mapping
ggplot(monthly_trends, aes(x = Month, y = Average_Price)) +
  geom_line() +
  geom_text(
    data = global_max,
    aes(label = paste("Global Max:", round(Average_Price, 2))),
    vjust = -0.5
  ) +
  labs(
    title = "Monthly Trends in Average House Prices",
    x = "Month",
    y = "Average Price",
    caption = generate_figure_caption("Monthly Trends in Average House Prices", section = 5)
  ) +
  scale_x_continuous(breaks = 1:12, labels = month.name) +
  theme_minimal()

# Monthly Trends in Count of Homes Sold
# Tabulate against all twelve month numbers so a month with no sales is counted
# as 0. A plain table() would omit it, and the counts would then no longer line
# up with the twelve labels below (a length error, or silent recycling).
monthly_counts <- table(factor(train_df_non_linear$month_sold, levels = 1:12))
months <- factor(1:12, labels = month.name)
monthly_counts_df <- data.frame(Month = months, Count = as.numeric(monthly_counts))
# Find the global maximum
global_max_count <- monthly_counts_df[which.max(monthly_counts_df$Count), ]
ggplot(monthly_counts_df, aes(x = Month, y = Count, group = 1)) +
  geom_bar(stat = "identity") +
  # Add label for global maximum
  geom_text(
    data = global_max_count,
    aes(x = Month, y = Count, label = paste("Global Max:", Count)),
    vjust = -0.5
  ) +
  labs(
    title = "Monthly Trends in Count of Homes Sold",
    x = "Month",
    y = "Count of Homes Sold",
    caption = generate_figure_caption("Monthly Trends in Count of Homes Sold", section = 5)
  ) +
  theme_minimal()

5.5.2 Seasonal Trends in Average House Prices
# Mean price of the rows flagged by each season_* dummy column
season_names <- c("Winter", "Spring", "Summer", "Fall")
seasonal_trends <- data.frame(
  Season = season_names,
  Average_Price = vapply(
    season_names,
    function(season) {
      in_season <- train_df_non_linear[[paste0("season_", season)]] == 1
      mean(train_df_non_linear$price[in_season])
    },
    numeric(1),
    USE.NAMES = FALSE
  )
)
# Row holding the highest seasonal average; used to annotate the peak
global_max_seasonal <- seasonal_trends[which.max(seasonal_trends$Average_Price), ]
# One bar per season, coloured by season, with the peak labelled above its bar
ggplot(seasonal_trends, aes(x = Season, y = Average_Price, fill = Season)) +
  geom_col() +
  geom_text(
    data = global_max_seasonal,
    aes(label = paste("Global Max:", round(Average_Price, 2)), y = Average_Price),
    vjust = -0.5
  ) +
  labs(
    title = "Seasonal Trends in Average House Prices",
    x = "Season",
    y = "Average Price",
    caption = generate_figure_caption("Seasonal Trends in Average House Prices", section = 5)
  ) +
  theme_minimal()

# Number of rows flagged by each season_* dummy column, in the order
# Winter, Spring, Summer, Fall
seasonal_counts <- vapply(
  c("Winter", "Spring", "Summer", "Fall"),
  function(season) sum(train_df_non_linear[[paste0("season_", season)]] == 1),
  integer(1),
  USE.NAMES = FALSE
)
seasonal_counts_df <- data.frame(
  Season = c("Winter", "Spring", "Summer", "Fall"),
  Count = seasonal_counts
)
# Row holding the largest seasonal count; used to annotate the peak
global_max_count_seasonal <- seasonal_counts_df[which.max(seasonal_counts_df$Count), ]
# One bar per season, coloured by season, with the peak labelled above its bar
ggplot(seasonal_counts_df, aes(x = Season, y = Count, fill = Season)) +
  geom_col() +
  geom_text(
    data = global_max_count_seasonal,
    aes(label = paste("Global Max:", Count), y = Count),
    vjust = -0.5
  ) +
  labs(
    title = "Seasonal Trends in Count of Homes Sold",
    x = "Season",
    y = "Count of Homes Sold",
    caption = generate_figure_caption("Seasonal Trends in Count of Homes Sold", section = 5)
  ) +
  theme_minimal()

5.5.3 Week of the Year Trends in Average House Prices
# Mean sale price per week_of_year value, as a two-column frame
weekly_trends <- setNames(
  aggregate(
    train_df_non_linear$price,
    by = list(train_df_non_linear$week_of_year),
    FUN = mean
  ),
  c("Week_of_Year", "Average_Price")
)
# Row holding the highest weekly average; used to annotate the peak
global_max_weekly <- weekly_trends[which.max(weekly_trends$Average_Price), ]
# Line chart of the weekly averages with the peak labelled just above it;
# the text layer inherits x and y from the plot-level mapping
ggplot(weekly_trends, aes(x = Week_of_Year, y = Average_Price)) +
  geom_line() +
  geom_text(
    data = global_max_weekly,
    aes(label = paste("Global Max:", round(Average_Price, 2))),
    vjust = -0.5
  ) +
  labs(
    title = "Weekly Trends in Average House Prices",
    x = "Week of Year",
    y = "Average Price",
    caption = generate_figure_caption("Weekly Trends in Average House Prices", section = 5)
  ) +
  theme_minimal()

# Number of sales recorded for each week_of_year value
weekly_counts <- table(train_df_non_linear$week_of_year)
# Numeric week (from the table names) alongside its count
weekly_counts_df <- data.frame(
  Week_of_Year = as.numeric(names(weekly_counts)),
  Count = as.numeric(weekly_counts)
)
# Row holding the busiest week; used to annotate the peak
global_max_count_weekly <- weekly_counts_df[which.max(weekly_counts_df$Count), ]
# One bar per week with the peak labelled just above its bar;
# the text layer inherits x and y from the plot-level mapping
ggplot(weekly_counts_df, aes(x = Week_of_Year, y = Count)) +
  geom_col() +
  geom_text(
    data = global_max_count_weekly,
    aes(label = paste("Global Max:", Count)),
    vjust = -0.5
  ) +
  labs(
    title = "Weekly Trends in Count of Homes Sold",
    x = "Week of Year",
    y = "Count of Homes Sold",
    caption = generate_figure_caption("Weekly Trends in Count of Homes Sold", section = 5)
  ) +
  theme_minimal()

5.5.4 Day of the Year Trends in Average House Prices
# Mean sale price per day_of_year value, as a two-column frame
daily_trends <- setNames(
  aggregate(
    train_df_non_linear$price,
    by = list(train_df_non_linear$day_of_year),
    FUN = mean
  ),
  c("Day_of_Year", "Average_Price")
)
# Row holding the highest daily average; used to annotate the peak
global_max_daily <- daily_trends[which.max(daily_trends$Average_Price), ]
# Line chart of the daily averages with the peak labelled just above it;
# the text layer inherits x and y from the plot-level mapping
ggplot(daily_trends, aes(x = Day_of_Year, y = Average_Price)) +
  geom_line() +
  geom_text(
    data = global_max_daily,
    aes(label = paste("Global Max:", round(Average_Price, 2))),
    vjust = -0.5
  ) +
  labs(
    title = "Daily Trends in Average House Prices",
    x = "Day of Year",
    y = "Average Price",
    caption = generate_figure_caption("Daily Trends in Average House Prices", section = 5)
  ) +
  theme_minimal()

# Number of sales recorded for each day_of_year value
daily_counts <- table(train_df_non_linear$day_of_year)
# Numeric day (from the table names) alongside its count
daily_counts_df <- data.frame(
  Day_of_Year = as.numeric(names(daily_counts)),
  Count = as.numeric(daily_counts)
)
# Row holding the busiest day; used to annotate the peak
global_max_count_daily <- daily_counts_df[which.max(daily_counts_df$Count), ]
# One bar per day with the peak labelled just above its bar;
# the text layer inherits x and y from the plot-level mapping
ggplot(daily_counts_df, aes(x = Day_of_Year, y = Count)) +
  geom_col() +
  geom_text(
    data = global_max_count_daily,
    aes(label = paste("Global Max:", Count)),
    vjust = -0.5
  ) +
  labs(
    title = "Daily Trends in Count of Homes Sold",
    x = "Day of Year",
    y = "Count of Homes Sold",
    caption = generate_figure_caption("Daily Trends in Count of Homes Sold", section = 5)
  ) +
  theme_minimal()

- The “Daily Trends in Average House Prices” line chart showcases the
average house prices for each day of the year. It helps identify daily
patterns and potential price variations that could be influenced by
specific dates or events.
5.6 Geographical Influence Analysis
Investigating the spatial aspect by analyzing the
distance_to_convergence variable.
5.6.1 Distance to Convergence Point Map
# Calculate z-scores for the prices.
# scale() returns a one-column matrix; as.numeric() stores a plain numeric
# column instead of embedding a matrix in the data frame.
train_df_non_linear <- train_df_non_linear %>%
  mutate(z_score = as.numeric(scale(price)))
# Define z-score intervals and corresponding colors
z_score_intervals <- seq(-3, 3, by = 1) # z-scores from -3 to 3
color_sequence <- c("green", "#8fd744", "#fde725", "#f76818ff", "#d7301fff", "#440154") # From green to dark color
# Price at each z-score interval (vectorised: mean + z * sd).
# na.rm = TRUE matches the min/max calls used for the outer breaks.
price_mean <- mean(train_df_non_linear$price, na.rm = TRUE)
price_sd <- sd(train_df_non_linear$price, na.rm = TRUE)
price_at_intervals <- price_mean + z_score_intervals * price_sd
# Build the bin edges: interval prices rounded to the nearest 25k, kept only if
# strictly inside the observed price range, then enclosed by the observed
# minimum and maximum. sort(unique()) guarantees increasing, distinct edges;
# previously the minimum was prepended without checking that the rounded
# values were above it, and only negative values were removed.
price_range <- range(train_df_non_linear$price, na.rm = TRUE)
breaks <- round(price_at_intervals / 25000) * 25000
breaks <- breaks[breaks > price_range[1] & breaks < price_range[2]]
breaks <- sort(unique(c(price_range[1], breaks, price_range[2])))
# Create color palette with a color for each interval
color_palette <- colorBin(color_sequence, domain = train_df_non_linear$price, bins = breaks, na.color = "#808080")
# Initialize the leaflet map: one small circle marker per home, coloured by
# price bin. digits = 0 makes the popup show whole dollars.
m <- leaflet(train_df_non_linear) %>%
  addTiles() %>%
  addCircleMarkers(
    lat = ~lat, lng = ~long,
    color = ~color_palette(price),
    fillColor = ~color_palette(price),
    fillOpacity = 0.8,
    radius = 1, # Small dots
    popup = ~paste(
      "Price: $", formatC(price, format = "f", digits = 0, big.mark = ","),
      "<br>", "Z-Score: ", round(z_score, 2)
    )
  )
# Define the maximum distance for the distance bands
max_distance <- max(train_df_non_linear$distance_to_convergence, na.rm = TRUE)
# Add dashed distance bands around convergence_point (lat, lng), one every
# 2 distance units. The guard avoids seq() failing when max_distance < 2.
# NOTE(review): radius = band * 1000 suggests distance_to_convergence is in
# kilometres and the radius in metres - confirm.
if (max_distance >= 2) {
  for (band in seq(2, max_distance, by = 2)) {
    m <- addCircles(
      m,
      lat = convergence_point[1], lng = convergence_point[2],
      radius = band * 1000,
      color = "grey", weight = 1, fill = FALSE, dashArray = "5, 5"
    )
  }
}
# Add legend and finalize the map
m <- m %>%
  addLegend(
    position = "bottomright",
    pal = color_palette,
    values = ~price,
    title = "Price",
    labFormat = labelFormat(prefix = "$"),
    opacity = 1
  ) %>%
  setView(lng = convergence_point[2], lat = convergence_point[1], zoom = 10)
cat(generate_figure_caption('Distance to Convergence Map', section = 5))
Figure 5.38 Distance to Convergence Map
5.6.2 Conclusion
This detailed review of the King County house sales dataset
underscores the thorough preparation undertaken for the predictive
analysis. The dataset’s diverse variables, both continuous and
categorical, have been meticulously processed and analyzed, providing a
robust foundation for developing the predictive model. With the
comprehensive EDA and graphical analysis, we gain valuable insights into
the correlations and distributions within the data, setting the stage
for effective model building and accurate house price prediction.
5.7 Removal of Plot Features, Correlation, Multicollinearity and NA
Values
# Columns removed from all four data sets before modelling: helper columns
# created for the visualizations in prior steps, plus columns dropped for high
# correlation, multicollinearity or NA values in the model.
# Defined once so the four frames cannot drift apart.
# NOTE(review): the highly correlated grade pair in Table 5.2 is
# grade_Above_Average / grade_Average, but the column dropped here is
# grade_Below_Average - confirm this is intended.
columns_to_drop <- c(
  "view_category", "condition_category", "grade_category",
  "grade_category_numeric", "z_score", "lat", "long", "sqft_above",
  "month_sold", "week_of_year", "condition_3", "grade_Below_Average",
  "bedrooms_factor", "bathrooms_factor"
)
# drop = FALSE keeps the result a data frame even if only one column remains
train_df_non_linear <- train_df_non_linear[, !colnames(train_df_non_linear) %in% columns_to_drop, drop = FALSE]
train_df_linear <- train_df_linear[, !colnames(train_df_linear) %in% columns_to_drop, drop = FALSE]
test_df_linear <- test_df_linear[, !colnames(test_df_linear) %in% columns_to_drop, drop = FALSE]
test_df_non_linear <- test_df_non_linear[, !colnames(test_df_non_linear) %in% columns_to_drop, drop = FALSE]
# Rebuild linear regression model before performing stepwise
linear_model_initial <- lm(price ~ ., data = train_df_linear)
# Add new coefficients to dataframe
coefficients_df <- create_coefficients_df(
  model = linear_model_initial,
  model_name = "OLS w/o corr",
  coefficients_df = coefficients_df
)